import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the daily traffic data; expected columns: time, visits, conversions
df = pd.read_csv('CR drop.csv')
# Display the raw frame (notebook cell output)
df
| time | visits | conversions | |
|---|---|---|---|
| 0 | 0 | 17 | 4 |
| 1 | 1 | 18 | 4 |
| 2 | 2 | 12 | 4 |
| 3 | 3 | 16 | 6 |
| 4 | 4 | 24 | 11 |
| ... | ... | ... | ... |
| 163 | 163 | 30 | 7 |
| 164 | 164 | 34 | 10 |
| 165 | 165 | 31 | 12 |
| 166 | 166 | 27 | 5 |
| 167 | 167 | 27 | 8 |
168 rows × 3 columns
# Check missing values: count NaNs per column (cell output shows all zeros)
df.isnull().sum()
time 0 visits 0 conversions 0 dtype: int64
# Check duplicate values
# Selecting duplicate based on all columns (a row is a duplicate only if
# every column matches an earlier row)
duplicate = df[df.duplicated()]
print("Duplicate Rows :")
duplicate
Duplicate Rows :
| time | visits | conversions |
|---|
No missing or duplicated value
# Add conversion rate column
# NOTE(review): a day with visits == 0 would produce inf/NaN here — the data
# shown has no zero-visit days, but confirm before reusing on other datasets
df['conversion_rate'] = round(df.conversions/df.visits,3)
# computing a 7 day rolling average (first 6 rows are NaN until the window fills)
df['7day_rolling_avg'] = df['conversion_rate'].rolling(7).mean()
# 7-day rolling standard deviation over the same window
df['rolling_std'] = df['conversion_rate'].rolling(7).std()
df.head(10)
| time | visits | conversions | conversion_rate | 7day_rolling_avg | rolling_std | |
|---|---|---|---|---|---|---|
| 0 | 0 | 17 | 4 | 0.235 | NaN | NaN |
| 1 | 1 | 18 | 4 | 0.222 | NaN | NaN |
| 2 | 2 | 12 | 4 | 0.333 | NaN | NaN |
| 3 | 3 | 16 | 6 | 0.375 | NaN | NaN |
| 4 | 4 | 24 | 11 | 0.458 | NaN | NaN |
| 5 | 5 | 12 | 6 | 0.500 | NaN | NaN |
| 6 | 6 | 20 | 4 | 0.200 | 0.331857 | 0.118942 |
| 7 | 7 | 23 | 8 | 0.348 | 0.348000 | 0.111009 |
| 8 | 8 | 20 | 3 | 0.150 | 0.337714 | 0.126837 |
| 9 | 9 | 22 | 10 | 0.455 | 0.355143 | 0.134247 |
# Visualize the trend and moving average with plotly
# (removed the stray plt.figure(...) call: it only created an empty
# matplotlib figure — the "<Figure ... with 0 Axes>" artifact in the cell
# output — since the actual chart is rendered by plotly, not matplotlib)
import plotly.express as px
fig = px.line(df, x="time", y=['conversion_rate', '7day_rolling_avg', 'rolling_std'], title='Daily Conversion Rate',template = 'plotly_dark')
fig.show()
<Figure size 960x480 with 0 Axes>
The rolling mean decreases over time. Therefore, we can conclude that the time series is not stationary.
We can check whether the data is stationary or not using the Augmented Dickey-Fuller (ADF) test.
The ADF test returns statistics that include the p-value, which is used to determine whether the data is stationary or not. If the p-value is less than 0.05 then the data is stationary; if the p-value is greater than 0.05 then the data is non-stationary. The 0.05 is called the significance level, which corresponds to a 95% confidence level. You can also try out different significance levels to test stationarity.
# Import libraries
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
# NOTE(review): statsmodels.tsa.arima_model.ARMA is deprecated (removed after
# statsmodels 0.12) — statsmodels.tsa.arima.model.ARIMA is the replacement
from statsmodels.tsa.arima_model import ARMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Half the series length (168 / 2) — used below as the decomposition period
len(df)/2
84.0
# ETS Decomposition
# period=84 is half the 168-day series; model='multiplicative' assumes the
# seasonal effect scales with the level of the series
result = seasonal_decompose(df['conversion_rate'],
model ='multiplicative', period = 84)
# ETS plot
result.plot()
# Create function to test stationarity of the variable
# (adfuller is already imported above; this re-import is harmless)
from statsmodels.tsa.stattools import adfuller


def test_stationarity(df, column='', signif=0.05, series=False):
    """Run an Augmented Dickey-Fuller test and label the data.

    df      : DataFrame holding the data, or a Series when series=True
    column  : column of ``df`` to test when series=False
    signif  : significance level the p-value is compared against
    series  : when True, treat ``df`` itself as the series under test

    Returns "Stationary" when p-value <= signif, else "Non-Stationary".
    """
    data = df if series else df[column]
    p_value = adfuller(data, autolag='AIC')[1]
    return "Stationary" if p_value <= signif else "Non-Stationary"
# Test if conversion rate is stationary (mean is constant over time)
test_stationarity(df, 'conversion_rate')
'Non-Stationary'
# Convert the data to stationary
def differencing(df, column, order):
    """Difference ``df[column]`` by ``order`` lags.

    The leading NaN values produced by ``diff`` are replaced with the mean
    of the differenced series so the result contains no missing values.
    """
    diffed = df[column].diff(order)
    return diffed.fillna(diffed.mean())
# First-order differencing of the conversion rate
stationary_cr = differencing(df, 'conversion_rate', 1)
# Test again
test_stationarity(stationary_cr, series=True)
'Stationary'
# Add the stationary (differenced) series and its rolling stats to the data
df['new_cr'] = stationary_cr
df['new_rolling_avg'] = df['new_cr'].rolling(7).mean()
df['new_rolling_std'] = df['new_cr'].rolling(7).std()
# Visualize the stationary moving average and std with plotly
# (removed the stray plt.figure(...) call: it only produced an empty
# matplotlib figure, since the chart is rendered by plotly)
import plotly.express as px
fig = px.line(df, x="time", y=['new_cr', 'new_rolling_avg', 'new_rolling_std'], title='Daily Conversion Rate',template = 'plotly_dark')
fig.show()
<Figure size 960x480 with 0 Axes>
After differencing, the data has become stationary. Now we can use this data to train the time series model.
Time-series data which is strictly sequential and has autocorrelation. We can’t use least-squares or linear regression directly here because of the autocorrelation of the residuals. The autocorrelation of the residuals would result in spurious predictions. That is why we use a special model constructed specially for time-series data.
Here, we are going to use the ARMA (AutoRegressive Moving Average) model to forecast the data. The ARMA model has two parameters, namely p and q. The p is for the AR (Auto Regression) and the q is for the MA (Moving Average). The Auto Regression uses the previous lags to model the data and the Moving Average uses the previous forecast errors to model the data. We can combine the two complementary views of modeling X_t into a single equation, giving rise to the ARMA(p,q) model.
Using historical data, we can select p and q in ARMA(p,q) model and learn the model coefficients {\theta} and {\phi} based on which we can make a future prediction. A substantial deviation from the prediction result in time-series anomaly. We can define substantial as two standard deviation from a moving average of X. The model parameteters {\theta} and {\phi} are learned via Maximum Likelihood Estimation (MLE) as implemented in the statsmodels package.
Since the conversion rate variable is non-seasonal, we use ARMA over SARMA.
Akaike Information Critera (AIC) is a widely used measure of a statistical model. It basically quantifies 1) the goodness of fit 2) the simplicity/parsimony of the model into a single statistic. The one with the lower AIC is generally “better”.
To find the best order (p, q) for the model we have to select the order (p, q) that minimizes the AIC for the model. I created a function to traverse through different values of order (p, q) and compare AICs against each other; the result will give us the best order (p, q) with the minimum AIC.
# Import libraries
# NOTE(review): ARIMA is imported here but the search below uses ARMA — verify
from statsmodels.tsa.arima_model import ARIMA
# Find the best order (p, q )
# Search bounds for the AR order p and the MA order q
max_p, max_q = 5, 5
import joblib
def get_order_sets(n, n_per_set) -> list:
    """Split range(n) into consecutive chunks of at most ``n_per_set``.

    e.g. get_order_sets(6, 5) -> [[0, 1, 2, 3, 4], [5]]

    Used to batch candidate MA orders for parallel fitting.
    """
    # idiom fix: list(range(n)) replaces the redundant [i for i in range(n)]
    orders = list(range(n))
    return [
        orders[i:i + n_per_set]
        for i in range(0, n, n_per_set)
    ]
def find_aic_for_model(df, p, q, model, model_name):
    """Fit ``model(df, order=(p, q))`` and return ``(aic, (p, q))``.

    Returns ``(None, (p, q))`` when the fit raises, or when
    (p, q) == (0, 1), which the caller fits once up front.
    """
    try:
        # bug fix: the message used to end with a stray literal "n"
        # (a lost "\n" escape); print() already appends a newline
        print(f"Fitting {model_name} with order p, q = {p, q}")
        if p == 0 and q == 1:
            # since p=0 and q=1 is already
            # calculated
            return None, (p, q)
        ts_results = model(df, order=(p, q)).fit(disp=False)
        curr_aic = ts_results.aic
        return curr_aic, (p, q)
    except Exception as e:
        # bug fix: the original built an f-string here but never printed it,
        # so fit failures were silently swallowed
        print(f"Exception occurred continuing {e}")
        return None, (p, q)
def find_best_order_for_model(df, model, model_name):
    """Grid-search orders (p, q) up to (max_p, max_q) and return the minimum-AIC fit.

    Returns (min_aic, min_order, results_df), where results_df lists the AIC
    obtained for every (p, q) attempted (None where fitting failed or was skipped).
    Relies on module-level ``max_p``, ``max_q``, ``joblib`` and ``get_order_sets``.
    """
    p_ar, q_ma = max_p, max_q
    final_results = []
    # fit the (0, 1) baseline once here; find_aic_for_model skips it later
    ts_results = model(df, order=(0, 1)).fit(disp=False)
    min_aic = ts_results.aic
    final_results.append((min_aic, (0, 1)))
    # example if q_ma is 6
    # order_sets would be [[0, 1, 2, 3, 4], [5]]
    order_sets = get_order_sets(q_ma, 5)
    for p in range(0, p_ar):
        for order_set in order_sets:
            # fit the model and find the aic — one thread per q in the batch
            results = joblib.Parallel(n_jobs=len(order_set), prefer='threads')(
                joblib.delayed(find_aic_for_model)(df, p, q, model, model_name)
                for q in order_set
            )
            final_results.extend(results)
    results_df = pd.DataFrame(
        final_results,
        columns=['aic', 'order']
    )
    # row(s) with the smallest AIC (min() ignores the None/NaN failed fits)
    min_df = results_df[
        results_df['aic'] == results_df['aic'].min()
    ]
    min_aic = min_df['aic'].iloc[0]
    min_order = min_df['order'].iloc[0]
    return min_aic, min_order, results_df
# Run the grid search over the differenced conversion-rate series
min_aic, min_order, results_df = find_best_order_for_model(
    stationary_cr, ARMA, "ARMA"
)
print(min_aic, min_order)
/Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/arima_model.py:472: FutureWarning:
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX. These will be removed after the 0.12 release.
statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.
To silence this warning and continue using ARMA and ARIMA until they are
removed, use:
import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
FutureWarning)
Fitting ARMA with order p, q = (0, 0)n Fitting ARMA with order p, q = (0, 1)nFitting ARMA with order p, q = (0, 2)n Fitting ARMA with order p, q = (0, 3)n Fitting ARMA with order p, q = (0, 4)n Fitting ARMA with order p, q = (1, 0)nFitting ARMA with order p, q = (1, 1)n Fitting ARMA with order p, q = (1, 2)n Fitting ARMA with order p, q = (1, 3)nFitting ARMA with order p, q = (1, 4)n Fitting ARMA with order p, q = (2, 0)nFitting ARMA with order p, q = (2, 1)n Fitting ARMA with order p, q = (2, 2)n Fitting ARMA with order p, q = (2, 3)n Fitting ARMA with order p, q = (2, 4)n
/Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/tsatools.py:701: RuntimeWarning: overflow encountered in exp /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/tsatools.py:701: RuntimeWarning: invalid value encountered in true_divide /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/tsatools.py:702: RuntimeWarning: overflow encountered in exp /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/tsatools.py:702: RuntimeWarning: invalid value encountered in true_divide /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
Fitting ARMA with order p, q = (3, 0)nFitting ARMA with order p, q = (3, 1)n Fitting ARMA with order p, q = (3, 2)nFitting ARMA with order p, q = (3, 3)nFitting ARMA with order p, q = (3, 4)n
/Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
Fitting ARMA with order p, q = (4, 0)nFitting ARMA with order p, q = (4, 1)nFitting ARMA with order p, q = (4, 2)n Fitting ARMA with order p, q = (4, 3)n Fitting ARMA with order p, q = (4, 4)n
/Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:566: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
-342.8249615230079 (0, 1)
/Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available /Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py:566: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
Now that we have found that (0, 1) is the best order we can use that to fit the ARMA model.
# Import libraries
# NOTE(review): mean_squared_error and train_test_split are imported but never
# used anywhere in this analysis
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Train model on the differenced series with the best (p, q) found above
arma = ARMA(stationary_cr, order=min_order)
arma_fit = arma.fit()
/Users/thaoduyentran/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/arima_model.py:472: FutureWarning:
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX. These will be removed after the 0.12 release.
statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.
To silence this warning and continue using ARMA and ARIMA until they are
removed, use:
import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
FutureWarning)
This problem is unconstrained.
RUNNING THE L-BFGS-B CODE
* * *
Machine precision = 2.220D-16
N = 2 M = 12
At X0 0 variables are exactly at the bounds
At iterate 0 f= -1.03015D+00 |proj g|= 3.24924D+00
At iterate 5 f= -1.03056D+00 |proj g|= 1.54136D-01
At iterate 10 f= -1.03077D+00 |proj g|= 2.32201D+00
At iterate 15 f= -1.03707D+00 |proj g|= 5.25967D+00
At iterate 20 f= -1.03792D+00 |proj g|= 5.05629D+00
At iterate 25 f= -1.03817D+00 |proj g|= 2.02508D-02
At iterate 30 f= -1.03817D+00 |proj g|= 4.23754D-03
At iterate 35 f= -1.03817D+00 |proj g|= 2.22045D-08
At iterate 40 f= -1.03817D+00 |proj g|= 1.60982D-05
* * *
Tit = total number of iterations
Tnf = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip = number of BFGS updates skipped
Nact = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F = final function value
* * *
N Tit Tnf Tnint Skip Nact Projg F
2 40 63 1 0 0 1.610D-05 -1.038D+00
F = -1.0381695283422854
CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Warning: more than 10 function and gradient evaluations in the last line search. Termination may possibly be caused by a bad search direction.
# Add the fitted/predicted value to df
df['predictions_ARMA'] = arma_fit.fittedvalues
# NOTE(review): plt.figure only creates an unused empty matplotlib figure —
# the chart below is drawn by plotly
plt.figure( figsize = ( 12, 6),dpi=80)
fig = px.line(df, x="time", y=['conversion_rate', 'predictions_ARMA'], title='Daily Conversion Rate',template = 'plotly_dark')
fig.show()
<Figure size 960x480 with 0 Axes>
# Interpret the model: coefficients, their p-values, and AIC/BIC
arma_fit.summary()
| Dep. Variable: | conversion_rate | No. Observations: | 168 |
|---|---|---|---|
| Model: | ARMA(0, 1) | Log Likelihood | 174.412 |
| Method: | css-mle | S.D. of innovations | 0.084 |
| Date: | Mon, 02 May 2022 | AIC | -342.825 |
| Time: | 01:08:03 | BIC | -333.453 |
| Sample: | 0 | HQIC | -339.021 |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -0.0007 | 0.000 | -5.560 | 0.000 | -0.001 | -0.000 |
| ma.L1.conversion_rate | -0.9999 | 0.032 | -31.165 | 0.000 | -1.063 | -0.937 |
| Real | Imaginary | Modulus | Frequency | |
|---|---|---|---|---|
| MA.1 | 1.0001 | +0.0000j | 1.0001 | 0.0000 |
P>|z| are the p-values of the coefficients, they are both less than 0.05 so we have enough confidence to conclude that its prediction is statistically significant
The AIC is minimum as we optimize the order (p,q)
Residual is the difference between the actual conversion rate and the fitted values by ARMA Times Series Model
# Add residual column
df['residual'] = arma_fit.resid
# Plot residual errors
residuals = arma_fit.resid
fig, ax = plt.subplots(1,2)
# left panel: residuals over time; right panel: kernel density estimate
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()
The residuals follow a normal distribution, as shown by the bell curve.
Create find anomalies function based on threshold. Here, the threshold is defined as mean squared error plus one standard deviation of squared errors to detect unusual spikes or drops that fall above or below the usual mean squared error and one std of the conversion rate from the ARMA times series model
We can calculate the threshold differently based on discussion with stakeholders, business goals
# Create function to find anomalies (unusual drop/ spike in conversion rate)
import numpy as np
# squared residuals of the fitted ARMA model — the anomaly score
squared_errors = arma_fit.resid**2
def find_anomalies(squared_errors):
    """Flag observations whose squared error reaches mean + one std.

    Returns (anomaly, threshold): ``anomaly`` is a 0/1 int mask aligned with
    the input, ``threshold`` is the cutoff that was applied.
    """
    cutoff = np.mean(squared_errors) + np.std(squared_errors)
    flags = (squared_errors >= cutoff).astype(int)
    return flags, cutoff
# Detect anomalies by the function above
anomaly, threshold = find_anomalies(squared_errors)
# Create anomaly column
df['anomaly'] = anomaly
# Visualize the anomalies
import plotly.graph_objects as go
# plot value on y-axis and date on x-axis
fig = px.line(df, x=df.time, y="conversion_rate", title='Conversion Rate - ARMA Anomaly Detection', template = 'plotly_dark')
# create list of outlier_time
outlier_time = df[df['anomaly']==1].time
# obtain y value of anomalies to plot
# (indexing df.loc by 'time' works because time values coincide with the index)
y_values = [df.loc[i]['conversion_rate'] for i in outlier_time]
fig.add_trace(go.Scatter(x=outlier_time, y=y_values, mode = 'markers',
name = 'Anomaly',
marker=dict(color='red',size=10)))
fig.show()
# Display anomalies
df_anomaly = df[df['anomaly']==1]
df_anomaly = df_anomaly[['time','conversion_rate','predictions_ARMA','anomaly']]
df_anomaly
| time | conversion_rate | predictions_ARMA | anomaly | |
|---|---|---|---|---|
| 4 | 4 | 0.458 | -0.097292 | 1 |
| 5 | 5 | 0.500 | -0.150983 | 1 |
| 8 | 8 | 0.150 | -0.028851 | 1 |
| 9 | 9 | 0.455 | 0.151495 | 1 |
| 13 | 13 | 0.455 | -0.007146 | 1 |
| 22 | 22 | 0.600 | 0.014107 | 1 |
| 27 | 27 | 0.552 | 0.041189 | 1 |
| 29 | 29 | 0.455 | -0.021045 | 1 |
| 37 | 37 | 0.450 | 0.029117 | 1 |
| 38 | 38 | 0.103 | -0.148728 | 1 |
| 39 | 39 | 0.438 | 0.192575 | 1 |
| 40 | 40 | 0.500 | -0.139691 | 1 |
| 56 | 56 | 0.115 | -0.076022 | 1 |
| 70 | 70 | 0.481 | 0.058574 | 1 |
| 84 | 84 | 0.118 | 0.076808 | 1 |
| 165 | 165 | 0.387 | -0.091726 | 1 |
The table above shows us the estimates of the conversion rate(s) and the estimated day of changes
Based on the graphs, the estimate time and conversion rate of the drops along with their CI bounds are:
# Days judged (from the plot) to be conversion-rate drops rather than spikes
drop_time = [8,38,56,84]
df_anomaly[df_anomaly['time'].isin(drop_time)]
| time | conversion_rate | predictions_ARMA | anomaly | |
|---|---|---|---|---|
| 8 | 8 | 0.150 | -0.028851 | 1 |
| 38 | 38 | 0.103 | -0.148728 | 1 |
| 56 | 56 | 0.115 | -0.076022 | 1 |
| 84 | 84 | 0.118 | 0.076808 | 1 |
# Get the list of indexes that contains anomalies/changes
idx = df.index.get_indexer_for(df[df.anomaly==1].index)
# Create a new data set that contains changes and their before and after records
# (each anomaly index i expands to [i-1, i, i+1], clipped to the frame bounds,
# then de-duplicated via np.unique)
new = df.iloc[np.unique(np.concatenate([np.arange(max(i-1,0), min(i+1+1, len(df)))
for i in idx]))]
new = new[['time','conversion_rate','predictions_ARMA','anomaly']]
# Index that have the change
idx
array([ 4, 5, 8, 9, 13, 22, 27, 29, 37, 38, 39, 40, 56,
70, 84, 165])
def rowStyle(row):
    """Background-color a table row: red for known drops, light green for spikes."""
    drop_days = [8, 38, 56, 84]
    spike_days = [4, 5, 9, 13, 22, 27, 29, 37, 39, 40, 70, 165]
    if row.time in drop_days:
        color = 'background-color: red'
    elif row.time in spike_days:
        color = 'background-color: lightgreen'
    else:
        color = ''
    return [color] * len(row)
new.style.apply(rowStyle, axis=1)
| time | conversion_rate | predictions_ARMA | anomaly | |
|---|---|---|---|---|
| 3 | 3 | 0.375000 | -0.078691 | 0 |
| 4 | 4 | 0.458000 | -0.097292 | 1 |
| 5 | 5 | 0.500000 | -0.150983 | 1 |
| 6 | 6 | 0.200000 | -0.166154 | 0 |
| 7 | 7 | 0.348000 | 0.116375 | 0 |
| 8 | 8 | 0.150000 | -0.028851 | 1 |
| 9 | 9 | 0.455000 | 0.151495 | 1 |
| 10 | 10 | 0.190000 | -0.140290 | 0 |
| 12 | 12 | 0.316000 | -0.075899 | 0 |
| 13 | 13 | 0.455000 | -0.007146 | 1 |
| 14 | 14 | 0.286000 | -0.137143 | 0 |
| 21 | 21 | 0.294000 | 0.001521 | 0 |
| 22 | 22 | 0.600000 | 0.014107 | 1 |
| 23 | 23 | 0.231000 | -0.280471 | 0 |
| 26 | 26 | 0.263000 | 0.073481 | 0 |
| 27 | 27 | 0.552000 | 0.041189 | 1 |
| 28 | 28 | 0.333000 | -0.240006 | 0 |
| 29 | 29 | 0.455000 | -0.021045 | 1 |
| 30 | 30 | 0.280000 | -0.139171 | 0 |
| 36 | 36 | 0.269000 | 0.127664 | 0 |
| 37 | 37 | 0.450000 | 0.029117 | 1 |
| 38 | 38 | 0.103000 | -0.148728 | 1 |
| 39 | 39 | 0.438000 | 0.192575 | 1 |
| 40 | 40 | 0.500000 | -0.139691 | 1 |
| 41 | 41 | 0.348000 | -0.197628 | 0 |
| 55 | 55 | 0.371000 | -0.087626 | 0 |
| 56 | 56 | 0.115000 | -0.076022 | 1 |
| 57 | 57 | 0.400000 | 0.176136 | 0 |
| 69 | 69 | 0.225000 | 0.005161 | 0 |
| 70 | 70 | 0.481000 | 0.058574 | 1 |
| 71 | 71 | 0.188000 | -0.195424 | 0 |
| 83 | 83 | 0.190000 | 0.114471 | 0 |
| 84 | 84 | 0.118000 | 0.076808 | 1 |
| 85 | 85 | 0.259000 | 0.146338 | 0 |
| 164 | 164 | 0.294000 | -0.030538 | 0 |
| 165 | 165 | 0.387000 | -0.091726 | 1 |
| 166 | 166 | 0.185000 | -0.184360 | 0 |
Confidence Interval is calculated according to this formula:
CI = mean +/- t*(s/sqrt(n))
Confidence Level: 95% , alpha = 5%
mean: mean of all the estimates
t_0.025,15 = 2.131
s: standard deviation of residuals
n: sample size of all the estimates; df: degrees of freedom = n - 1 = 15
t-distribution table: https://www.sjsu.edu/faculty/gerstman/StatPrimer/t-table.pdf
# Calculate upper and lower CI for the estimated conversion rates of change
import math as m
from scipy import stats
cr_change = df_anomaly['conversion_rate']
# 95% two-sided t critical value with n-1 degrees of freedom
# (generalizes the hard-coded 2.131, which was only valid for n = 16)
t_crit = stats.t.ppf(0.975, len(cr_change) - 1)
margin = t_crit * cr_change.std() / m.sqrt(len(cr_change))
upper_CI_change = round(cr_change.mean() + margin, 3)
lower_CI_change = round(cr_change.mean() - margin, 3)
print('CI of conversion rates on days of change:(',lower_CI_change, ',',upper_CI_change,')')
CI of conversion rates on days of change:( 0.3 , 0.477 )
# Get the records before and after days of change (the non-anomaly rows of `new`)
before_after = new[new['anomaly']==0]
before_after
| time | conversion_rate | predictions_ARMA | anomaly | |
|---|---|---|---|---|
| 3 | 3 | 0.375 | -0.078691 | 0 |
| 6 | 6 | 0.200 | -0.166154 | 0 |
| 7 | 7 | 0.348 | 0.116375 | 0 |
| 10 | 10 | 0.190 | -0.140290 | 0 |
| 12 | 12 | 0.316 | -0.075899 | 0 |
| 14 | 14 | 0.286 | -0.137143 | 0 |
| 21 | 21 | 0.294 | 0.001521 | 0 |
| 23 | 23 | 0.231 | -0.280471 | 0 |
| 26 | 26 | 0.263 | 0.073481 | 0 |
| 28 | 28 | 0.333 | -0.240006 | 0 |
| 30 | 30 | 0.280 | -0.139171 | 0 |
| 36 | 36 | 0.269 | 0.127664 | 0 |
| 41 | 41 | 0.348 | -0.197628 | 0 |
| 55 | 55 | 0.371 | -0.087626 | 0 |
| 57 | 57 | 0.400 | 0.176136 | 0 |
| 69 | 69 | 0.225 | 0.005161 | 0 |
| 71 | 71 | 0.188 | -0.195424 | 0 |
| 83 | 83 | 0.190 | 0.114471 | 0 |
| 85 | 85 | 0.259 | 0.146338 | 0 |
| 164 | 164 | 0.294 | -0.030538 | 0 |
| 166 | 166 | 0.185 | -0.184360 | 0 |
# Calculate upper and lower CI for the estimated conversion rates before and after the days of change
import math as m
from scipy import stats
cr_ba = before_after['conversion_rate']
# 95% two-sided t critical value with n-1 degrees of freedom
# (bug fix: the hard-coded 2.131 corresponds to df = 15, but this sample has
# 21 rows, so the correct critical value is t_0.025,20 ≈ 2.086)
t_crit_ba = stats.t.ppf(0.975, len(cr_ba) - 1)
margin_ba = t_crit_ba * cr_ba.std() / m.sqrt(len(cr_ba))
upper_CI_ba = round(cr_ba.mean() + margin_ba, 3)
lower_CI_ba = round(cr_ba.mean() - margin_ba, 3)
print('CI of conversion rates before and after the change:(',lower_CI_ba, ',',upper_CI_ba,')')
CI of conversion rates before and after the change:( 0.247 , 0.31 )